Contents

%run set_theme.ipynb
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode

init_notebook_mode()
df = pd.read_parquet('../data/SO_2014_2022.pq')

df = df[(df['Salary'] > 0) & (df['Salary'] < 250000)]

df.head()
Year Salary JobSat YearsCode YearsCodePro Age Education OrgSize LastNewJob Employment RespondentType JobSeek Gender Student Country CodingActivities DevType LearnCodeFrom LangPresent
0 2022 69318.0 <NA> 10 5 25-34 master 500 to 999 employees <NA> fulltime dev <NA> male no Germany School or academic work Data scientist or machine learning specialist;... Books / Physical media;School (i.e., Universit... C;C++;Java;JavaScript;MATLAB;Python;Scala;SQL;...
6 2022 27652.0 <NA> 18 10 25-34 bachelor 1,000 to 4,999 employees <NA> fulltime dev <NA> male no Colombia Hobby Developer, full-stack;Developer, back-end Books / Physical media;Other online resources ... Bash/Shell/PowerShell;Elixir;HTML/CSS;JavaScri...
9 2022 15431.0 <NA> 5 5 25-34 bachelor 20 to 99 employees <NA> fulltime dev <NA> male no Ghana Freelance/contract work Developer, back-end On the job training;Coding Bootcamp JavaScript;Ruby
13 2022 47352.0 <NA> 7 7 45-54 master 10 to 19 employees <NA> fulltime non-dev <NA> male no Belgium Hobby Developer, back-end;Educator or academic;Datab... Books / Physical media;On the job training;Col... Delphi;SQL
22 2022 78084.0 <NA> 25 25 45-54 bachelor 500 to 999 employees <NA> fulltime non-dev <NA> male no Canada Hobby;Contribute to open-source projects Engineer, site reliability;Security professional Books / Physical media;Other online resources ... Bash/Shell/PowerShell;C;JavaScript;Perl;PHP;Py...
# Show the top-n countries in the plot.
TOP_N_COUNTRIES = 10

# Take the top-n countries that have the most developers.
countries = df[['Country', 'RespondentType']] \
    .query('RespondentType == "dev"') \
    .query('Country != "Brazil"') \
    .query('Country != "India"') \
    .query('Country != "Russia"') \
    .query('Country != "Sweden"') \
    .groupby('Country', as_index=False) \
    .count() \
    .sort_values('RespondentType', ascending=False) \
    .head(TOP_N_COUNTRIES) \
    ['Country'] \
    .tolist()

countries_df = df[df['Country'].isin(countries)]
    
# Calculate the salary gap in percentage.
salary_men_df = countries_df.query('Gender == "male"') \
    .groupby(['Year', 'Country']) \
    .agg({'Salary': 'mean' }) \
    .reset_index() \
    .rename(columns={'Salary': 'SalaryMen'})

salary_women_df = countries_df.query('Gender == "female"') \
    .groupby(['Year', 'Country']) \
    .agg({'Salary': 'mean' }) \
    .reset_index() \
    .rename(columns={'Salary': 'SalaryWomen'})

salary_gap_df = pd.merge(salary_men_df, salary_women_df, on=['Year', 'Country']).dropna()
salary_gap_df['SalaryGap'] = abs(salary_gap_df['SalaryMen'] - salary_gap_df['SalaryWomen'])
salary_gap_df['SalaryGapPercent'] = (salary_gap_df['SalaryMen'] - salary_gap_df['SalaryWomen']) / salary_gap_df['SalaryMen'] * 100

salary_gap_df.head()
Year Country SalaryMen SalaryWomen SalaryGap SalaryGapPercent
0 2014 Australia 81355.140187 70000.0 11355.140187 13.957496
1 2014 Canada 71124.260355 57692.307692 13431.952663 18.885191
2 2014 France 54724.409449 47000.0 7724.409449 14.115108
3 2014 Germany 61491.596639 50714.285714 10777.310924 17.526478
4 2014 Italy 33923.076923 25000.0 8923.076923 26.303855
# Calculate the mean salary gap for all countries for all genders.
mean_salary_men_df = countries_df.query('Gender == "male"') \
    .groupby(['Year', 'Country']) \
    .agg({'Salary': 'mean' }) \
    .reset_index() \
    .rename(columns={'Salary': 'SalaryMen'})

mean_salary_women_df = countries_df.query('Gender == "female"') \
    .groupby(['Year', 'Country']) \
    .agg({'Salary': 'mean' }) \
    .reset_index() \
    .rename(columns={'Salary': 'SalaryWomen'})

mean_salary_gap_df = pd.merge(mean_salary_men_df, mean_salary_women_df, on=['Year', 'Country']).dropna()
mean_salary_gap_df['SalaryGap'] = abs(mean_salary_gap_df['SalaryMen'] - mean_salary_gap_df['SalaryWomen'])
mean_salary_gap_df['SalaryGapPercent'] = (mean_salary_gap_df['SalaryMen'] - mean_salary_gap_df['SalaryWomen']) / mean_salary_gap_df['SalaryMen'] * 100

mean_salary_gap = mean_salary_gap_df.groupby('Year', as_index=False)['SalaryGapPercent'].mean()

mean_salary_gap
Year SalaryGapPercent
0 2014 25.065
1 2015 23.649814
2 2016 18.958623
3 2017 14.428017
4 2018 14.927359
5 2019 16.585776
6 2020 18.866154
7 2021 21.314563
8 2022 16.113038
color_alpha = 0.7

fig = px.line(
    salary_gap_df,
    x='Year',
    y='SalaryGapPercent',
    color='Country',
    hover_name='Country',
    title='Top-10 Country Salary Gap Between Men and Women<br><sup>Generally decreasing, but still male-favoured</sup>',
    custom_data=['Country'],
    color_discrete_sequence=[
        f'rgba(233, 146, 119, {color_alpha})',
        f'rgba(148, 233, 119, {color_alpha})',
        f'rgba(119, 127, 233, {color_alpha})',
        f'rgba(218, 119, 233, {color_alpha})',
        f'rgba(218, 191, 162, {color_alpha})',
        f'rgba(207, 150, 150, {color_alpha})',
        f'rgba(167, 199, 213, {color_alpha})',
        f'rgba(233, 119, 186, {color_alpha})',
        f'rgba(154, 162, 221, {color_alpha})',
        f'rgba(154, 221, 209, {color_alpha})',
    ],
    range_y=[0, 48],
    width=790,
)

fig.update_layout(yaxis_title='Salary gap in percentage')

# Add male/female ratio in the legend names
total_male_participants_per_country_df = countries_df.query('Gender == "male"') \
    .groupby('Country', as_index=False) \
    .size() \
    .rename(columns={'size': 'TotalMale'})

total_female_participants_per_country_df = countries_df.query('Gender == "female"') \
    .groupby('Country', as_index=False) \
    .size() \
    .rename(columns={'size': 'TotalFemale'})

total_participants_per_country_df = pd.merge(total_male_participants_per_country_df, total_female_participants_per_country_df)
total_participants_per_country_df['MaleRatioPerFemale'] = total_participants_per_country_df['TotalMale'] / total_participants_per_country_df['TotalFemale']

# Add female/male ratio for each legend name.

# Example: {'oldname': 'newname'}
legend_names = {}
for _, row in total_participants_per_country_df.iterrows():
    legend_names[row['Country']] = f"{row['Country']} (1/{int(row['MaleRatioPerFemale'])})"

fig.for_each_trace(lambda t: t.update(name=legend_names[t.name], 
                                      legendgroup=legend_names[t.name],
                                      hovertemplate='<b>' + t.name + '</b><br>Year: %{x:d} </br>Salary gap: %{y:d}%<extra></extra>'))

# Plot the mean salary scatters
mean_salary_line_fig = px.line(mean_salary_gap,
                               x='Year',
                               y='SalaryGapPercent',
                               markers=True)
for trace in mean_salary_line_fig.data:
    trace.hovertemplate = '<b>' + trace.name + '</b><br>Year: %{x:d} </br>Mean salary gap: %{y:d}%<extra></extra>'

fig.add_traces(list(mean_salary_line_fig.select_traces()))
fig.add_annotation(x=2019.7, y=17.75,
                   showarrow=False,
                   xanchor='center',
                   yanchor='bottom',
                   textangle=-16,
                   text='mean salary gap')

# Add caption
fig.add_annotation(x=-0.11, y=-0.35,
                   xref='paper', yref='paper',
                   showarrow=False,
                   xanchor='left', yanchor='bottom',
                   align='left',
                   text='Salary gaps across ten of the countries with the most survey respondents.<br>Positive percentages indicates a male-favoured salary gap.')

fig.update_layout(
    legend={
        'title': 'Country with<br>female/male ratio<br>'
    },
    margin={'b': 130, 't': 100, 'l': 90}
)

fig.show()